#############################################################################################
## Replication code for:                                                                   ##
## A Male Hostility Spiral? Polarized Communication among Political Elites on Social Media ##
## Step 5.                                                                                 ##
#############################################################################################

library(tidyverse)
library(yardstick)
library(xtable)
library(quanteda)
options(scipen=999)
set.seed(42)

data <- read.csv("data1.csv")

data$inparty <- as.numeric(data$party_id_sender==data$party_id_receiver)

data <- data %>%
  mutate(inparty_pf = if_else(
    is.na(sender_parfam) | is.na(receiver_parfam),
    NA_integer_,
    as.integer(sender_parfam == receiver_parfam)
  ))

data$inparty_govopp  <- ifelse((data$sender_ingov==1&data$receiver_ingov==1)|data$inparty==1,1,0)


data$predicted_sentiment <- recode(data$predicted_sentiment,
                                   "Negative" = -1,
                                   "Neutral" = 0,
                                   "Positive" = 1)

data$predicted_targeted_sentiment <- recode(data$predicted_targeted_sentiment,
                                            "Negative" = -1,
                                            "Neutral" = 0,
                                            "Positive" = 1)

# Prediction accuracy across languages

annotated_sample <- read.csv("annotated_data.csv")

languages <- annotated_sample %>% group_by(language) %>% summarise(n=n()) %>% filter(n>30)

annotated_sample <- annotated_sample[annotated_sample$language%in%languages$language,]
annotated_sample$predicted_sentiment <- factor(annotated_sample$predicted_sentiment)
annotated_sample$annotated_sentiment <- factor(annotated_sample$annotated_sentiment)
annotated_sample$predicted_targeted_sentiment <- factor(annotated_sample$predicted_targeted_sentiment)
annotated_sample$annotated_targeted_sentiment <- factor(annotated_sample$annotated_targeted_sentiment)

metrics_df <- annotated_sample %>%
  group_by(language) %>%
  summarise(
    accuracy_sentiment = mean(annotated_sentiment == predicted_sentiment,na.rm=T),
    `accuracy_targeted sentiment` = mean(annotated_targeted_sentiment == predicted_targeted_sentiment,na.rm=T),
    f1_sentiment = f_meas_vec(truth = annotated_sentiment, estimate = predicted_sentiment, estimator = "macro"),
    `f1_targeted sentiment` = f_meas_vec(truth = annotated_targeted_sentiment, estimate = predicted_targeted_sentiment, estimator = "macro"),
    .groups = "drop"
  ) %>%
  pivot_longer(cols = c(accuracy_sentiment, `accuracy_targeted sentiment`, f1_sentiment, `f1_targeted sentiment`),
               names_to = c("metric", "category"),
               names_sep = "_",
               values_to = "value")

metrics_tot <- annotated_sample %>%
  summarise(
    accuracy_sentiment = mean(annotated_sentiment == predicted_sentiment,na.rm=T),
    `accuracy_targeted sentiment` = mean(annotated_targeted_sentiment == predicted_targeted_sentiment,na.rm=T),
    f1_sentiment = f_meas_vec(truth = annotated_sentiment, estimate = predicted_sentiment, estimator = "macro"),
    `f1_targeted sentiment` = f_meas_vec(truth = annotated_targeted_sentiment, estimate = predicted_targeted_sentiment, estimator = "macro"),
    .groups = "drop"
  ) %>%
  pivot_longer(cols = c(accuracy_sentiment, `accuracy_targeted sentiment`, f1_sentiment, `f1_targeted sentiment`),
               names_to = c("metric", "category"),
               names_sep = "_",
               values_to = "value")
metrics_tot$language <- "overall"
metrics_tot$group <- "overall"
metrics_tot$n = 1376

t <- annotated_sample %>% group_by(language) %>% summarise(n=n())
metrics_df <- merge(metrics_df,t,by="language")
metrics_df$group <- "language subgroups"
metrics_df <- rbind(metrics_tot,metrics_df)


png(file="tables_and_figures/fg5_oa.png",
    width=16, height=10, units="cm", res=800)
ggplot(metrics_df, aes(x = language, y = value, color = metric)) +
  geom_point(position = position_dodge(width = 0.5)) +
  geom_point(aes(size=n),position = position_dodge(width = 0.5),show.legend=F) +
  theme_bw() +
  facet_grid(category ~ group,scales="free_x",space="free_x")+
  ylab("")+
  theme(legend.position = "bottom")
dev.off()

# Correlation with emotion measure

targ_sent_model <- lm(predicted_targeted_sentiment~anger_v2+fear_v2+disgust_v2+sadness_v2+
                        joy_v2+enthusiasm_v2+pride_v2+hope_v2,data=data)

sent_model <- lm(predicted_sentiment~anger_v2+fear_v2+disgust_v2+sadness_v2+
                   joy_v2+enthusiasm_v2+pride_v2+hope_v2,data=data)


coef_targ_sent_model <- data.frame(
  term = names(coef(targ_sent_model)[-1]),  # Exclude intercept
  estimate = coef(targ_sent_model)[-1],
  model = "Targeted Sentiment"
)

coef_sent_model <- data.frame(
  term = names(coef(sent_model)[-1]),
  estimate = coef(sent_model)[-1],
  model = "Sentiment"
)

coef_data <- rbind(coef_targ_sent_model, coef_sent_model)
coef_data$term <- factor(coef_data$term)
levels(coef_data$term) <- c("Anger","Disgust","Enthusiasm","Fear","Hope","Joy","Pride","Sadness")

png(file="tables_and_figures/fg2.png",
    width=14, height=6, units="cm", res=800)
ggplot(coef_data, aes(x = reorder(term, estimate), y = estimate)) +
  geom_bar(stat = "identity", color = "black") +
  facet_grid(~model) +
  coord_flip() +
  labs(
    x = "Emotion",
    y = "Coefficient Value"
  ) +
  theme_bw()
dev.off()

# Fightin'Words Figure

# Code partially from https://burtmonroe.github.io/TextAsDataCourse/Tutorials/TADA-FightinWords.nb.html

get_words <- function(zeta,n=10){
  zeta <- data.frame(t(zeta))
  zeta$words <- row.names(zeta)
  colnames(zeta) <- c("Positive","Negative")
  zeta$Negative<-NULL
  zeta <- zeta[order(zeta$Positive),]
  pos <- tail(zeta,n=n)
  neg <- head(zeta,n=n)
  res <- rbind(pos,neg)
  return(res)  
}

fwgroups <- function(dtm, groups, pair = NULL, weights = rep(1,nrow(dtm)), k.prior = .1) {
  
  weights[is.na(weights)] <- 0
  
  weights <- weights/mean(weights)
  
  zero.doc <- rowSums(dtm)==0 | weights==0
  zero.term <- colSums(dtm[!zero.doc,])==0
  
  print("Initialize")
  dtm.nz <- apply(dtm[!zero.doc,!zero.term],2,"*", weights[!zero.doc])
  print("Pre Prior")
  g.prior <- tcrossprod(rowSums(dtm.nz),colSums(dtm.nz))/sum(dtm.nz)
  print("Prior")
  
  # 
  
  g.posterior <- as.matrix(dtm.nz + k.prior*g.prior)
  print("Posterior")
  groups <- groups[!zero.doc]
  groups <- droplevels(groups)
  
  g.adtm <- as.matrix(aggregate(x=g.posterior,by=list(groups=groups),FUN=sum)[,-1])
  rownames(g.adtm) <- levels(groups)
  print("g.adt")
  g.ladtm <- log(g.adtm)
  
  g.delta <- t(scale( t(scale(g.ladtm, center=T, scale=F)), center=T, scale=F))
  print("g.delta")
  g.adtm_w <- -sweep(g.adtm,1,rowSums(g.adtm)) # terms not w spoken by k
  g.adtm_k <- -sweep(g.adtm,2,colSums(g.adtm)) # w spoken by groups other than k
  g.adtm_kw <- sum(g.adtm) - g.adtm_w - g.adtm_k - g.adtm # total terms not w or k 
  print("gadtm_kw")
  g.se <- sqrt(1/g.adtm + 1/g.adtm_w + 1/g.adtm_k + 1/g.adtm_kw)
  
  g.zeta <- g.delta/g.se
  
  g.counts <- as.matrix(aggregate(x=dtm.nz, by = list(groups=groups), FUN=sum)[,-1])
  print("g.counts")
  if (!is.null(pair)) {
    pr.delta <- t(scale( t(scale(g.ladtm[pair,], center = T, scale =F)), center=T, scale=F))
    pr.adtm_w <- -sweep(g.adtm[pair,],1,rowSums(g.adtm[pair,]))
    pr.adtm_k <- -sweep(g.adtm[pair,],2,colSums(g.adtm[pair,])) # w spoken by groups other than k
    pr.adtm_kw <- sum(g.adtm[pair,]) - pr.adtm_w - pr.adtm_k - g.adtm[pair,] # total terms not w or k
    pr.se <- sqrt(1/g.adtm[pair,] + 1/pr.adtm_w + 1/pr.adtm_k + 1/pr.adtm_kw)
    pr.zeta <- pr.delta/pr.se
    
    return(list(zeta=pr.zeta[1,], delta=pr.delta[1,],se=pr.se[1,], counts = colSums(dtm.nz), acounts = colSums(g.adtm)))
  } else {
    return(list(zeta=g.zeta,delta=g.delta,se=g.se,counts=g.counts,acounts=g.adtm))
  }
}

get_fw_data <- function(data,c_id, n_words){
  
  cntry <- data %>%
    filter(country_id %in% c_id&predicted_targeted_sentiment!=0)
  cntry <- cntry[c("text_en","predicted_targeted_sentiment")]
  
  cntry$text <- gsub("\\s*@\\w+", "", cntry$text_en)
  cntry$text <- tolower(cntry$text)
  cntry_dfm <- dfm(tokens(cntry$text,
                          remove_punct=T,
                          remove_symbols = T,
                          remove_url = T))
  
  cntry_dfm <- dfm_trim(cntry_dfm,min_docfreq = 0.005, max_docfreq=0.01,docfreq_type = "prop")
  fw_res <- fwgroups(cntry_dfm,groups=factor(cntry$predicted_targeted_sentiment))
  
  words <-  get_words(fw_res$zeta)
  return(words)
}

res_UK <- get_fw_data(data=data,c_id=24,n_words=10) # NOTE: Requires "text"
res_UK$cntry <- "UK"
res_SE <- get_fw_data(data=data,c_id=22,n_words=10)
res_SE$cntry <- "SE"
res_IT <- get_fw_data(data=data,c_id=12,n_words=10)
res_IT$cntry <- "IT"
res_ES <- get_fw_data(data=data,c_id=21,n_words=10)
res_ES$cntry <- "ES"
res_FR <- get_fw_data(data=data,c_id=7,n_words=10)
res_FR$cntry <- "FR"

res <- rbind(res_UK,res_ES,res_FR,res_IT,res_SE)

colnames(res)[2] <- "words"
# png(file="figures/fg1.png",
#     width=15, height=8, units="cm", res=800)
ggplot(res, aes(y = reorder(words, Positive),x=factor(1), fill = Positive, label = words)) +
  geom_tile(aes(width = Inf), height = 1) + 
  geom_text(col="white",size=3) +
  theme_bw() +
  labs(fill = "Zeta") +
  theme(axis.title.x = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_blank(),
        axis.title.y = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank(),
        legend.position="bottom")+
  facet_wrap(~cntry,scales="free",ncol=5)
# dev.off()

# Summary Statistics

table <- data %>% group_by(country) %>% summarise(n_senders = length(unique(Sender_id)),
                                                  perc_female_senders   = n_distinct(Sender_id[sender_gender == "female"])/n_distinct(Sender_id),
                                                  n_receivers = length(unique(Receiver_id)),
                                                  perc_female_receiver   = n_distinct(Receiver_id[receiver_gender == "female"])/n_distinct(Receiver_id),
                                                  n_posts = n(),
                                                  perc_posts_to_women = mean(receiver_gender=="female",na.rm=T),
                                                  perc_posts_from_women = mean(sender_gender=="female",na.rm=T))
print(xtable(table), type = "html", file = "tables_and_figures/data_summary_table.html")

# Average sentiment across gender

full_data_mean <- data %>%
  group_by(inparty) %>%
  summarise(mean_value = mean(predicted_targeted_sentiment, na.rm = TRUE)) %>%
  mutate(group = "All MPs")

male_data_mean <- data %>%
  filter(receiver_gender == "male", sender_gender == "male") %>%
  group_by(inparty) %>%
  summarise(mean_value = mean(predicted_targeted_sentiment, na.rm = TRUE)) %>%
  mutate(group = "Male MPs only")

female_data_mean <- data %>%
  filter(receiver_gender == "female", sender_gender == "female") %>%
  group_by(inparty) %>%
  summarise(mean_value = mean(predicted_targeted_sentiment, na.rm = TRUE)) %>%
  mutate(group = "Female MPs only")

combined_means <- bind_rows(full_data_mean, male_data_mean, female_data_mean)

combined_means$mean_value <- round(combined_means$mean_value,digits=4)

differences <- combined_means %>%
  spread(inparty, mean_value) %>%
  mutate(difference = abs(`0` - `1`)) %>%
  dplyr::select(group, difference)

combined_means <- combined_means %>%
  left_join(differences, by = "group")

combined_means$inparty <- factor(combined_means$inparty)
levels(combined_means$inparty) <- c("Outparty","Inparty")
colnames(combined_means)[1] <- "Inparty/Outparty"

png(file="tables_and_figures/fg3.png",
    width=15, height=8, units="cm", res=600)
ggplot(combined_means, aes(x=group, y=mean_value, fill=`Inparty/Outparty`)) +
  geom_bar(stat="identity", position="dodge") +
  facet_wrap(~group, scales="free_x") +
  theme_bw() +
  labs(y="Targeted Sentiment", x="Data subset") +
  theme(legend.position="bottom")+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
dev.off()

t.test(
  predicted_targeted_sentiment ~ inparty, 
  data = data, 
  var.equal = TRUE
)

# Key variable summary

summary_stats <- data %>%
  mutate(
    gender_sender = ifelse(sender_gender == "male", 1, 0),
    gender_receiver = ifelse(receiver_gender == "male", 1, 0)
  ) %>%
  summarise(
    mean_inparty = mean(inparty, na.rm = TRUE),
    sd_inparty = sd(inparty, na.rm = TRUE),
    min_inparty = min(inparty, na.rm = TRUE),
    max_inparty = max(inparty, na.rm = TRUE),
    
    mean_inparty_pf = mean(inparty_pf, na.rm = TRUE),
    sd_inparty_pf = sd(inparty_pf, na.rm = TRUE),
    min_inparty_pf = min(inparty_pf, na.rm = TRUE),
    max_inparty_pf = max(inparty_pf, na.rm = TRUE),
    
    mean_inparty_govopp = mean(inparty_govopp, na.rm = TRUE),
    sd_inparty_govopp = sd(inparty_govopp, na.rm = TRUE),
    min_inparty_govopp = min(inparty_govopp, na.rm = TRUE),
    max_inparty_govopp = max(inparty_govopp, na.rm = TRUE),
    
    mean_gender_sender = mean(gender_sender, na.rm = TRUE),
    sd_gender_sender = sd(gender_sender, na.rm = TRUE),
    min_gender_sender = min(gender_sender, na.rm = TRUE),
    max_gender_sender = max(gender_sender, na.rm = TRUE),
    
    mean_gender_receiver = mean(gender_receiver, na.rm = TRUE),
    sd_gender_receiver = sd(gender_receiver, na.rm = TRUE),
    min_gender_receiver = min(gender_receiver, na.rm = TRUE),
    max_gender_receiver = max(gender_receiver, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = everything(), names_to = "stat", values_to = "value") %>%
  separate(stat, into = c("metric", "variable"), sep = "_", extra = "merge") %>%
  pivot_wider(names_from = metric, values_from = value)

colnames(summary_stats) <- c("Variable", "Mean", "SD", "Min", "Max")

print(xtable(summary_stats, digits = 3),
      type = "html",
      include.rownames = FALSE,
      file = "tables_and_figures/tbl1.html")

# Government Opposition percentages

gov_opp_dynamics <- data %>%
  filter(!(is.na(sender_ingov) | is.na(receiver_ingov))) %>%
  group_by(sender_ingov, receiver_ingov, predicted_targeted_sentiment) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(predicted_targeted_sentiment) %>%
  mutate(
    percent = (n / sum(n)) * 100,
    relative_fill = percent / max(percent)
  )
gov_opp_dynamics$sender_ingov <- factor(gov_opp_dynamics$sender_ingov)
gov_opp_dynamics$receiver_ingov <- factor(gov_opp_dynamics$receiver_ingov)
levels(gov_opp_dynamics$sender_ingov) <- c("Opposition", "Government")
levels(gov_opp_dynamics$receiver_ingov) <- c("Opposition", "Government")

png(file="tables_and_figures/fg1_oa.png",
    width=18, height=8, units="cm", res=800)
ggplot(gov_opp_dynamics, aes(x = receiver_ingov, y = sender_ingov, fill = relative_fill)) +
  geom_tile() +
  geom_text(aes(label = sprintf("%.1f%%", percent)), color = "black", size = 4) + # Add percentages as labels
  scale_fill_gradient(low = "white", high = "blue", name = "Percent (%)") +
  facet_wrap(~ predicted_targeted_sentiment) +
  theme_bw() +
  xlab("Receiver") +
  ylab("Sender") +
  theme(legend.position = "none")
dev.off()

# Party Family percentages

data$receiver_parfam <- factor(data$receiver_parfam,levels=c("Socialist","Social Democratic","Ecological",
                                                             "Agrarian","Liberal","Conservative","Christian Democratic","Nationalist"))

data$sender_parfam <- factor(data$sender_parfam,levels=c("Socialist","Social Democratic","Ecological",
                                                         "Agrarian","Liberal","Conservative","Christian Democratic","Nationalist"))

parfam_dynamics <- data %>% 
  filter(!(is.na(sender_parfam)|is.na(receiver_parfam))) %>% 
  group_by(sender_parfam,receiver_parfam,predicted_targeted_sentiment) %>% 
  summarise(n=n(), .groups="drop") %>%
  group_by(predicted_targeted_sentiment) %>%
  mutate(
    percent = (n / sum(n)) * 100,
    relative_fill = percent / max(percent)
  )

png(file="tables_and_figures/fg2_oa.png",
    width=35, height=15, units="cm", res=800)
ggplot(parfam_dynamics, aes(x = receiver_parfam, y = sender_parfam, fill = relative_fill)) +
  geom_tile() +
  geom_text(aes(label = sprintf("%.1f%%", percent)), color = "black", size = 4) + # Add percentages as labels
  scale_fill_gradient(low = "white", high = "blue", name = "Count (n)") +
  facet_wrap(~ predicted_targeted_sentiment,scales="free") +
  theme_bw()+
  xlab("Receiver")+
  ylab("Sender")+
  theme(legend.position="None")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
dev.off()


# Post distributions

hostility_dist <- data %>% group_by(sender) %>% summarise(neg = sum(predicted_targeted_sentiment==-1,na.rm=T),
                                                          pos = sum(predicted_targeted_sentiment==+1,na.rm=T),
                                                          n_tot = n(),
                                                          gender = unique(sender_gender))

hostility_long <- hostility_dist %>%
  pivot_longer(cols = c(neg, pos), names_to = "type", values_to = "value")

hostility_long <- hostility_long %>%
  mutate(
    bin = cut(
      value,
      breaks = c(-1, 0, 1, 5, 10, 50, 100, Inf),
      labels = c(0,"1","2-5","6-10","11-50","51-100","101+"),
      right = TRUE
    )
  )

hostility_long$type <- factor(hostility_long$type)
levels(hostility_long$type) <- c("Negative","Positive")

png(file="tables_and_figures/fg3_oa.png",
    width=20, height=10, units="cm", res=800)
ggplot(hostility_long, aes(x = bin, fill = gender)) +
  geom_bar(position = "dodge", alpha = 0.8, color = "black") +
  facet_wrap(~type) +
  theme_bw() +
  scale_fill_manual(values = c("female" = "lightpink", "male" = "lightblue")) +
  labs(
    x = "Number of Posts",
    y = "Frequency",
    fill = "Gender"
  )+
  theme(legend.position = "bottom")
dev.off()

# Cumulative post distributions

distribution_data <- hostility_long %>% group_by(type,gender) %>%
  arrange(desc(value),.by_group = T) %>%
  mutate(cumulative_value = cumsum(value)) %>%
  mutate(ranked_position = row_number()) %>%
  ungroup() %>%
  group_by(type) %>%
  mutate(total_posts = sum(value)) %>%
  ungroup() %>%
  group_by(gender) %>%
  mutate(total_pol = length(unique(sender))) %>%
  ungroup() %>%
  mutate(cumulative_perc = cumulative_value/total_posts,
         cumulative_position = ranked_position/total_pol)
distribution_data$gender <- factor(distribution_data$gender)
levels(distribution_data$gender) <- c("Female","Male")
colnames(distribution_data)[3] <- "Gender"
distribution_data$type <- factor(distribution_data$type)
levels(distribution_data$type) <- c("Negative posts","Positive posts")
distribution_data$cumulative_perc <- round(distribution_data$cumulative_perc*100,digits=1)
distribution_data$cumulative_position <- round(distribution_data$cumulative_position*100,digits=1)

png(file="tables_and_figures/fg4_oa.png",
    width=20, height=10, units="cm", res=800)
ggplot(distribution_data,aes(x=cumulative_position,y=cumulative_perc,group=Gender,col=Gender))+
  geom_line()+
  facet_wrap(~type)+
  theme_bw()+
  xlab("% of politicians")+
  ylab("% of posts")+
  theme(legend.position="bottom")
dev.off()

# Party Family statistics

parfam_table <- data %>% filter(!is.na(sender_parfam)) %>%
  group_by(sender_parfam) %>% summarise(n_senders = length(unique(Sender_id)),
                                                  perc_female_senders   = n_distinct(Sender_id[sender_gender == "female"])/n_distinct(Sender_id),
                                                  n_receivers = length(unique(Receiver_id)),
                                                  perc_female_receiver   = n_distinct(Receiver_id[receiver_gender == "female"])/n_distinct(Receiver_id),
                                                  n_posts = n(),
                                                  perc_posts_to_women = mean(receiver_gender=="female",na.rm=T),
                                                  perc_posts_from_women = mean(sender_gender=="female",na.rm=T))

print(xtable(parfam_table, digits = 3),
      type = "html",
      include.rownames = FALSE,
      file = "tables_and_figures/tbl2_oa.html")